################################################
############## Type I error rates ##############
## No differential variability and no outliers##
################################################

library(limma)
library(MASS)
library(statmod)
library(missMethyl)

# Simulate a two group dataset with 10000 CpGs and 100 arrays. Generate 1000 datasets

nprobes<-10000
narrays<-100
nsamples<-narrays/2
nsim <- 1

# Set up design matrix  	
geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
design<-model.matrix(~geno)


# Setting up the prior parameters of the model
d0<-20
s0<-0.8
mu0 <- -2
mu1 <- 2

# Set up empty vectors and matrices for results
sigma2<-rep(0,nprobes)
nde1<-nde2<-nde3<-nde4<-matrix(0,nsim,5)
colnames(nde1)<-colnames(nde2)<-colnames(nde3)<-colnames(nde4)<-c("Levene","LeveneT","Ftest","Bartlett","LeveneSQ")

for(i in 1:nsim){
# Sample the variances from a scaled inverse chisquare distribution
  sigma2<-d0*s0^2/(rchisq(nprobes,d0))
   
# Generate M values from a mixture of two normal distributions
  y <- matrix(0,ncol=narrays,nrow=nprobes)
  index <- sample(1:nprobes,nprobes/2)

# Unmethylated CpGs
  y[index,] <- matrix(rnorm(nprobes*nsamples,mean = mu0),nprobes/2,narrays)
# Methylated CpGs
  y[-index,] <- matrix(rnorm(nprobes*nsamples,mean = mu1),nprobes/2,narrays)

# Add noise into the M value signal from generated variances  
  y<-sqrt(sigma2)*y
  
# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE)
 
# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
  
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays-2)
  top<-(narrays-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)

# Calculate the type I error rate for the different methods
  nde1[i,1]<-sum(fit$p.value[,2]<0.001) 
  nde1[i,2]<-sum(fitT$p.value[,2]<0.001)
  nde1[i,3]<-sum(FPval<0.001)
  nde1[i,4]<-sum(BartPval<0.001)
  nde1[i,5]<-sum(fitSQ$p.value[,2]<0.001)
 
  nde2[i,1]<-sum(fit$p.value[,2]<0.01) 
  nde2[i,2]<-sum(fitT$p.value[,2]<0.01)
  nde2[i,3]<-sum(FPval<0.01)
  nde2[i,4]<-sum(BartPval<0.01)
  nde2[i,5]<-sum(fitSQ$p.value[,2]<0.01)

  nde3[i,1]<-sum(fit$p.value[,2]<0.05) 
  nde3[i,2]<-sum(fitT$p.value[,2]<0.05)
  nde3[i,3]<-sum(FPval<0.05)
  nde3[i,4]<-sum(BartPval<0.05)
  nde3[i,5]<-sum(fitSQ$p.value[,2]<0.05)
  
  nde4[i,1]<-sum(fit$p.value[,2]<0.1) 
  nde4[i,2]<-sum(fitT$p.value[,2]<0.1)
  nde4[i,3]<-sum(FPval<0.1)
  nde4[i,4]<-sum(BartPval<0.1)
  nde4[i,5]<-sum(fitSQ$p.value[,2]<0.1)
 
}

######################################################################
############## Type I error rates ####################################
## No differential variability and no outliers########################
## Based on the normal samples from the Kidney cancer dataset ########
######################################################################

library(limma)
library(minfi)
library(MASS)
library(statmod)
# library(missMethyl)
source("/mnt/storage/shared/Rpackages/missMethyl/R/DiffVar.R")

# load the data
load("/mnt/storage/belinda/Differential variability/Mval.Rdata")

# A two group dataset with 10000 CpGs and 100 arrays. Sample 1000 datasets from normals in the kidney cancer dataset.

nprobes<-10000
narrays<-100
nsamples<-narrays/2
nsim <- 1000

# Set up design matrix  	
geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
design<-model.matrix(~geno)

# Set up empty vectors and matrices for results
sigma2<-rep(0,nprobes)
nde1<-nde2<-nde3<-nde4<-matrix(0,nsim,5)
colnames(nde1)<-colnames(nde2)<-colnames(nde3)<-colnames(nde4)<-c("Levene","LeveneT","Ftest","Bartlett","LeveneSQ")

for(i in 1:nsim){

# Randomly sample from the normals

  rcpg<-sample(1:nrow(Mval),nprobes)
  rsam<-sample(1:160,narrays)
  y<-Mval[rcpg,rsam]

# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE)
 
# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
  
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays-2)
  top<-(narrays-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)

# Calculate the type I error rate for the different methods
  nde1[i,1]<-sum(fit$p.value[,2]<0.001) 
  nde1[i,2]<-sum(fitT$p.value[,2]<0.001)
  nde1[i,3]<-sum(FPval<0.001)
  nde1[i,4]<-sum(BartPval<0.001)
  nde1[i,5]<-sum(fitSQ$p.value[,2]<0.001)
 
  nde2[i,1]<-sum(fit$p.value[,2]<0.01) 
  nde2[i,2]<-sum(fitT$p.value[,2]<0.01)
  nde2[i,3]<-sum(FPval<0.01)
  nde2[i,4]<-sum(BartPval<0.01)
  nde2[i,5]<-sum(fitSQ$p.value[,2]<0.01)

  nde3[i,1]<-sum(fit$p.value[,2]<0.05) 
  nde3[i,2]<-sum(fitT$p.value[,2]<0.05)
  nde3[i,3]<-sum(FPval<0.05)
  nde3[i,4]<-sum(BartPval<0.05)
  nde3[i,5]<-sum(fitSQ$p.value[,2]<0.05)
  
  nde4[i,1]<-sum(fit$p.value[,2]<0.1) 
  nde4[i,2]<-sum(fitT$p.value[,2]<0.1)
  nde4[i,3]<-sum(FPval<0.1)
  nde4[i,4]<-sum(BartPval<0.1)
  nde4[i,5]<-sum(fitSQ$p.value[,2]<0.1)
 
}
 



#########################################################
############# Robustness to outliers ####################
#########################################################

library(limma)
library(MASS)
library(statmod)
library(missMethyl)

# Simulate a two group dataset with 10000 CpGs and 100 arrays. Generate 1000 datasets

nprobes<-10000
narrays<-100
nsamples<-narrays/2
nsim <- 1000

# Set up design matrix  	
geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
design<-model.matrix(~geno)


# Setting up the prior parameters of the model
d0<-20
s0<-0.8
mu0 <- -2
mu1 <- 2

# Set up empty vectors and matrices for results
sigma2<-rep(0,nprobes)
nde1<-nde2<-nde3<-nde4<-matrix(0,nsim,5)
colnames(nde1)<-colnames(nde2)<-colnames(nde3)<-colnames(nde4)<-c("Levene","LeveneT","Ftest","Bartlett","LeveneSQ")
fdr.l<-fdr.ls<-fdr.f<-fdr.b<-pval.l<-pval.f<-pval.b<-matrix(0,ncol=500,nrow=nsim)

for(i in 1:nsim){
# Sample the variances from a scaled inverse chisquare distribution
  sigma2<-d0*s0^2/(rchisq(nprobes,d0))
   
# Generate M values from a mixture of two normal distributions
  y <- matrix(0,ncol=narrays,nrow=nprobes)
  index <- sample(1:nprobes,nprobes/2)

# Unmethylated CpGs
  y[index,] <- matrix(rnorm(nprobes*nsamples,mean = mu0),nprobes/2,narrays)
# Methylated CpGs
  y[-index,] <- matrix(rnorm(nprobes*nsamples,mean = mu1),nprobes/2,narrays)

# Add noise into the M value signal from generated variances  
  y<-sqrt(sigma2)*y

# Add in 200 outliers  
  outliers <- sample (1:nprobes,200)
  y[outliers,(nsamples+1)]<-max(y)
  outstatus<-rep(0,nprobes)
  outstatus[outliers]<-1

# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE)
 
# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
  
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays-2)
  top<-(narrays-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)
  
# Calculate the type I error rate for the different methods
  nde1[i,1]<-sum(fit$p.value[,2]<0.001) 
  nde1[i,2]<-sum(fitT$p.value[,2]<0.001)
  nde1[i,3]<-sum(FPval<0.001)
  nde1[i,4]<-sum(BartPval<0.001)
  nde1[i,5]<-sum(fitSQ$p.value[,2]<0.001)

  nde2[i,1]<-sum(fit$p.value[,2]<0.01) 
  nde2[i,2]<-sum(fitT$p.value[,2]<0.01)
  nde2[i,3]<-sum(FPval<0.01)
  nde2[i,4]<-sum(BartPval<0.01)
  nde2[i,5]<-sum(fitSQ$p.value[,2]<0.01)
 
  nde3[i,1]<-sum(fit$p.value[,2]<0.05) 
  nde3[i,2]<-sum(fitT$p.value[,2]<0.05)
  nde3[i,3]<-sum(FPval<0.05)
  nde3[i,4]<-sum(BartPval<0.05)
  nde3[i,5]<-sum(fitSQ$p.value[,2]<0.05)

  nde4[i,1]<-sum(fit$p.value[,2]<0.1) 
  nde4[i,2]<-sum(fitT$p.value[,2]<0.1)
  nde4[i,3]<-sum(FPval<0.1)
  nde4[i,4]<-sum(BartPval<0.1)
  nde4[i,5]<-sum(fitSQ$p.value[,2]<0.1)
  
# Count discovered outliers
  o.l<-order(fit$p.value[,2])
  o.f<-order(FPval)
  o.b<-order(BartPval)
  o.ls <-order(fitSQ$p.value[,2])
  
  fdr.l[i,]<-cumsum(outstatus[o.l][1:500])
  fdr.f[i,]<-cumsum(outstatus[o.f][1:500])
  fdr.b[i,]<-cumsum(outstatus[o.b][1:500])
  fdr.ls[i,]<-cumsum(outstatus[o.ls][1:500])
 
}
  
######################################################
############## Differential variability ##############
######################################################

library(limma)
library(MASS)
library(statmod)
library(missMethyl)

# Simulate a two group dataset with 10000 CpGs and 100 arrays. Generate 1000 datasets

nprobes<-10000
narrays<-100
nsamples<-narrays/2
nsim <- 1000

# Set up design matrix
geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
design<-model.matrix(~geno)

# Set up prior parameters
p<-0.1
d0<-20
v0<-2
s0<-0.8

# Number of differentially variable CpGs
nde<-nprobes*p
de<-c(rep(1,nde),rep(0,nprobes-nde))

# Set up empty vectors and matrices to store results
sigma2<-matrix(0,ncol=2,nrow=nprobes)
estDE<-rep(0,nprobes)
fdr.l<-fdr.lt<-fdr.ls<-fdr.f<-fdr.b<-pval.l<-pval.f<-pval.b<-matrix(0,ncol=nde,nrow=nsim)

for(i in 1:nsim){
# Sample the variances from a scaled inverse chisquare distribution
  sigma2[,1]<-d0*s0^2/(rchisq(nprobes,d0))
  sigma2[,2]<-sigma2[,1]

# Sample the variances for the differentially variable CpGs to be roughly double in the second group compared to the first group
  sigma2[1:nde,2]<-1.5*d0/(rchisq(nde,d0))

# Generate M values from a mixture of two normal distributions
  y <- matrix(0,ncol=narrays,nrow=nprobes)
  index <- sample(1:nprobes,nprobes/2)
# Unmethylated
  y[index,] <- matrix(rnorm(nprobes*nsamples,mean = -2),nprobes/2,narrays)
# Methylated  
  y[-index,] <- matrix(rnorm(nprobes*nsamples,mean= 2), nprobes/2,narrays)

# Add noise into the M value signal from generated variances  
  y[,1:nsamples]<-sqrt(sigma2[,1])*y[,1:nsamples]
  y[,(nsamples+1):narrays]<-sqrt(sigma2[,2])*y[,(nsamples+1):narrays]
  
# Add in 200 outliers  
  outliers <- sample (nde:nprobes,200)
  y[outliers,(nsamples+1)]<-max(y)
  outstatus<-rep(0,nprobes)
  outstatus[outliers]<-1
  
# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE) 
  fit$genes$DE <- fitT$genes$DE <- fitSQ$genes$DE <- de

# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
  
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays-2)
  top<-(narrays-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)
  
# Count false discoveries
  fp<-1-de
  o.l<-order(fit$p.value[,2])
  o.lt<-order(fitT$p.value[,2])
  o.ls<-order(fitSQ$p.value[,2])
  o.f<-order(FPval)
  o.b<-order(BartPval)
 
  fdr.l[i,]<-cumsum(fp[o.l][1:nde])
  fdr.lt[i,]<-cumsum(fp[o.lt][1:nde])
  fdr.ls[i,]<-cumsum(fp[o.ls][1:nde])
  fdr.f[i,]<-cumsum(fp[o.f][1:nde])
  fdr.b[i,]<-cumsum(fp[o.b][1:nde])
}

############################################
############ Sample size study #############
############################################

############ Group 2 twice as variable as group 1 ##############

library(limma)
library(MASS)
library(statmod)
library(missMethyl)

# Simulate a two group dataset with 50000 CpGs and varying numbers of arrays. Generate 1000 datasets at each sample size.

nprobes<-50000
narrays<-c(20,40,60,80,100,120,140,160,180,200)
nsim <- 1000

# Set up prior parameters
p<-0.1
d0<-20
v0<-2
s02<-0.8^2

# Number of differentially variable CpGs
nde<-nprobes*p
de<-c(rep(1,nde),rep(0,nprobes-nde))

# Set up empty vectors and matrices to store results
sigma2<-matrix(0,ncol=2,nrow=nprobes)
estDE<-rep(0,nprobes)
nde.l<-nde.lt<-nde.ls<-nde.f<-nde.b<-matrix(0,ncol=length(narrays),nrow=nsim)
tp.l<-tp.lt<-tp.ls<-tp.f<-tp.b<-matrix(0,ncol=length(narrays),nrow=nsim)


for(j in 1:length(narrays)){
for(i in 1:nsim){
# Set up design matrix
  nsamples<-narrays[j]/2
  geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
  design<-model.matrix(~geno)

# Sample the variances from a scaled inverse chisquare distribution  
  sigma2[,1]<-d0*s02/(rchisq(nprobes,d0))
  sigma2[,2]<-sigma2[,1]

# Sample the variances for the differentially variable CpGs to be roughly double in the second group compared to the first group
  sigma2[1:nde,2]<-1.5*d0/(rchisq(nde,d0))

# Generate M values from a mixture of two normal distributions
  y <- matrix(0,ncol=narrays[j],nrow=nprobes)
  index <- sample(1:nprobes,nprobes/2)
# Unmethylated
  y[index,] <- matrix(rnorm(nprobes*nsamples,mean = -2),nprobes/2,narrays[j])
# Methylated  
  y[-index,] <- matrix(rnorm(nprobes*nsamples,mean= 2), nprobes/2,narrays[j])

# Add noise into the M value signal from generated variances  
  y[,1:nsamples]<-sqrt(sigma2[,1])*y[,1:nsamples]
  y[,(nsamples+1):narrays[j]]<-sqrt(sigma2[,2])*y[,(nsamples+1):narrays[j]]

# Add in 200 outliers    
  outliers <- sample (nde:nprobes,200)
  y[outliers,(nsamples+1)]<-max(y)
  outstatus<-rep(0,nprobes)
  outstatus[outliers]<-1
 
# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE) 
  fit$genes$DE <- fitT$genes$DE <- fitSQ$genes$DE <- de

# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
    
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays[j]-2)
  top<-(narrays[j]-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays[j]-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)

# Count numbers of false positives and true positives
  nde.l[i,j]<-sum(p.adjust(fit$p.value[,2],method="BH")<0.05)
  nde.lt[i,j]<-sum(p.adjust(fitT$p.value[,2],method="BH")<0.05)
  nde.ls[i,j]<-sum(p.adjust(fitSQ$p.value[,2],method="BH")<0.05)
  nde.f[i,j]<-sum(p.adjust(FPval,method="BH")<0.05)
  nde.b[i,j]<-sum(p.adjust(BartPval,method="BH")<0.05)
 
  tp.l[i,j]<-sum(p.adjust(fit$p.value[,2],method="BH")[1:nde]<0.05)
  tp.lt[i,j]<-sum(p.adjust(fitT$p.value[,2],method="BH")[1:nde]<0.05)
  tp.ls[i,j]<-sum(p.adjust(fitSQ$p.value[,2],method="BH")[1:nde]<0.05)
  tp.f[i,j]<-sum(p.adjust(FPval,method="BH")[1:nde]<0.05)
  tp.b[i,j]<-sum(p.adjust(BartPval,method="BH")[1:nde]<0.05)
 }
}
 

############ Group 2 5x as variable than group 1 ##############

library(limma)
library(MASS)
library(statmod)
library(missMethyl)

# Simulate a two group dataset with 50000 CpGs and varying numbers of arrays. Generate 1000 datasets at each sample size.

nprobes<-50000
narrays<-c(20,40,60,80,100,120,140,160,180,200)
nsim <- 1000

# Set up prior parameters
p<-0.1
d0<-20
v0<-2
s02<-0.8^2

# Number of differentially variable CpGs
nde<-nprobes*p
de<-c(rep(1,nde),rep(0,nprobes-nde))

# Set up empty vectors and matrices to store results
sigma2<-matrix(0,ncol=2,nrow=nprobes)
estDE<-rep(0,nprobes)
nde.l<-nde.lt<-nde.ls<-nde.f<-nde.b<-matrix(0,ncol=length(narrays),nrow=nsim)
tp.l<-tp.lt<-tp.ls<-tp.f<-tp.b<-matrix(0,ncol=length(narrays),nrow=nsim)


for(j in 1:length(narrays)){
cat("Sample size = ",j*10,"\n")
for(i in 1:nsim){
# Set up design matrix
  nsamples<-narrays[j]/2
  geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
  design<-model.matrix(~geno)

# Sample the variances from a scaled inverse chisquare distribution  
  sigma2[,1]<-d0*s02/(rchisq(nprobes,d0))
  sigma2[,2]<-sigma2[,1]

# Sample the variances for the differentially variable CpGs to be 5x more variable in the second group compared to the first group
  sigma2[1:nde,2]<-3.2*d0/(rchisq(nde,d0))

# Generate M values from a mixture of two normal distributions
  y <- matrix(0,ncol=narrays[j],nrow=nprobes)
  index <- sample(1:nprobes,nprobes/2)
# Unmethylated
  y[index,] <- matrix(rnorm(nprobes*nsamples,mean = -2),nprobes/2,narrays[j])
# Methylated  
  y[-index,] <- matrix(rnorm(nprobes*nsamples,mean= 2), nprobes/2,narrays[j])

# Add noise into the M value signal from generated variances  
  y[,1:nsamples]<-sqrt(sigma2[,1])*y[,1:nsamples]
  y[,(nsamples+1):narrays[j]]<-sqrt(sigma2[,2])*y[,(nsamples+1):narrays[j]]

# Add in 200 outliers    
  outliers <- sample (nde:nprobes,200)
  y[outliers,(nsamples+1)]<-max(y)
  outstatus<-rep(0,nprobes)
  outstatus[outliers]<-1
 
# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE) 
  fit$genes$DE <- fitT$genes$DE <- fitSQ$genes$DE <- de

# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
    
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays[j]-2)
  top<-(narrays[j]-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays[j]-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)

# Count numbers of false positives and true positives
  nde.l[i,j]<-sum(p.adjust(fit$p.value[,2],method="BH")<0.05)
  nde.lt[i,j]<-sum(p.adjust(fitT$p.value[,2],method="BH")<0.05)
  nde.ls[i,j]<-sum(p.adjust(fitSQ$p.value[,2],method="BH")<0.05)
  nde.f[i,j]<-sum(p.adjust(FPval,method="BH")<0.05)
  nde.b[i,j]<-sum(p.adjust(BartPval,method="BH")<0.05)
 
  tp.l[i,j]<-sum(p.adjust(fit$p.value[,2],method="BH")[1:nde]<0.05)
  tp.lt[i,j]<-sum(p.adjust(fitT$p.value[,2],method="BH")[1:nde]<0.05)
  tp.ls[i,j]<-sum(p.adjust(fitSQ$p.value[,2],method="BH")[1:nde]<0.05)
  tp.f[i,j]<-sum(p.adjust(FPval,method="BH")[1:nde]<0.05)
  tp.b[i,j]<-sum(p.adjust(BartPval,method="BH")[1:nde]<0.05)
 }
}


################### Group 2 10x more variable than group 1 ##############

library(limma)
library(MASS)
library(statmod)
library(missMethyl)

# Simulate a two group dataset with 50000 CpGs and varying numbers of arrays. Generate 1000 datasets at each sample size.

nprobes<-50000
narrays<-c(20,40,60,80,100,120,140,160,180,200)
nsim <- 1000

# Set up prior parameters
p<-0.1
d0<-20
v0<-2
s02<-0.8^2

# Number of differentially variable CpGs
nde<-nprobes*p
de<-c(rep(1,nde),rep(0,nprobes-nde))

# Set up empty vectors and matrices to store results
sigma2<-matrix(0,ncol=2,nrow=nprobes)
estDE<-rep(0,nprobes)
nde.l<-nde.lt<-nde.ls<-nde.f<-nde.b<-matrix(0,ncol=length(narrays),nrow=nsim)
tp.l<-tp.lt<-tp.ls<-tp.f<-tp.b<-matrix(0,ncol=length(narrays),nrow=nsim)


for(j in 1:length(narrays)){
cat("Sample size = ",j*10,"\n")
for(i in 1:nsim){
# Set up design matrix
  nsamples<-narrays[j]/2
  geno<-factor(c(rep(1,nsamples),rep(2,nsamples)))
  design<-model.matrix(~geno)

# Sample the variances from a scaled inverse chisquare distribution  
  sigma2[,1]<-d0*s02/(rchisq(nprobes,d0))
  sigma2[,2]<-sigma2[,1]

# Sample the variances for the differentially variable CpGs to be 10x more variable in the second group compared to the first group
  sigma2[1:nde,2]<-6.4*d0/(rchisq(nde,d0))

# Generate M values from a mixture of two normal distributions
  y <- matrix(0,ncol=narrays[j],nrow=nprobes)
  index <- sample(1:nprobes,nprobes/2)
# Unmethylated
  y[index,] <- matrix(rnorm(nprobes*nsamples,mean = -2),nprobes/2,narrays[j])
# Methylated  
  y[-index,] <- matrix(rnorm(nprobes*nsamples,mean= 2), nprobes/2,narrays[j])

# Add noise into the M value signal from generated variances  
  y[,1:nsamples]<-sqrt(sigma2[,1])*y[,1:nsamples]
  y[,(nsamples+1):narrays[j]]<-sqrt(sigma2[,2])*y[,(nsamples+1):narrays[j]]

# Add in 200 outliers    
  outliers <- sample (nde:nprobes,200)
  y[outliers,(nsamples+1)]<-max(y)
  outstatus<-rep(0,nprobes)
  outstatus[outliers]<-1
 
# Test for differential variability with DiffVar with various settings turned on and off
  fit<-varFit(y,design,coef=2,trend=FALSE,robust=TRUE)
  fitT<-varFit(y,design,coef=2,trend=TRUE,robust=TRUE)
  fitSQ<-varFit(y,design,coef=2,type="SQ",trend=FALSE,robust=TRUE) 
  fit$genes$DE <- fitT$genes$DE <- fitSQ$genes$DE <- de

# Test for differential variability with the F and Bartlett test 
  group <- design[,2]
  i.group <- names(table(group))
  i.data1 <- y[,group==i.group[1]]
  i.data2 <- y[,group==i.group[2]]
  varnorm<-apply(i.data1,1,var)
  varcan<-apply(i.data2,1,var)
  Fstat<-varcan/varnorm
  FPval<-2*pmin(pf(Fstat,df1=ncol(i.data1)-1,df2=ncol(i.data2)-1,lower.tail=FALSE),pf(1/Fstat,df1=ncol(i.data2)-1,df2=ncol(i.data1)-1,lower.tail=FALSE))
    
  varp<-(nsamples-1)*(varnorm+varcan)/(narrays[j]-2)
  top<-(narrays[j]-2)*log(varp) - (nsamples-1)*(log(varcan)+log(varnorm))
  bottom<-1+2/3/(nsamples-1)-1/(narrays[j]-2)
  Bstat<-top/bottom
  BartPval<-pchisq(Bstat,df=1,lower.tail=FALSE)

# Count numbers of false positives and true positives
  nde.l[i,j]<-sum(p.adjust(fit$p.value[,2],method="BH")<0.05)
  nde.lt[i,j]<-sum(p.adjust(fitT$p.value[,2],method="BH")<0.05)
  nde.ls[i,j]<-sum(p.adjust(fitSQ$p.value[,2],method="BH")<0.05)
  nde.f[i,j]<-sum(p.adjust(FPval,method="BH")<0.05)
  nde.b[i,j]<-sum(p.adjust(BartPval,method="BH")<0.05)
 
  tp.l[i,j]<-sum(p.adjust(fit$p.value[,2],method="BH")[1:nde]<0.05)
  tp.lt[i,j]<-sum(p.adjust(fitT$p.value[,2],method="BH")[1:nde]<0.05)
  tp.ls[i,j]<-sum(p.adjust(fitSQ$p.value[,2],method="BH")[1:nde]<0.05)
  tp.f[i,j]<-sum(p.adjust(FPval,method="BH")[1:nde]<0.05)
  tp.b[i,j]<-sum(p.adjust(BartPval,method="BH")[1:nde]<0.05)
 }
}
